
#######################################################
#  Program Associated With "Patenting Video Gameplay" #
# 	              Gregory Schwartz 	    			  #
#######################################################

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, classification_report
from imblearn.over_sampling import SMOTE
import progressbar 

#####################
# Create Classifier #
#####################

# Load the hand-labeled sample and keep only the rows that carry a manual label.
dataframe1 = pd.read_excel('data.xlsx', 'Labeled Sample')
dataframe1 = dataframe1.dropna(subset=["Manual Label"])

# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() also emitted a stray "None" line.
dataframe1.info()

# Build one text "paragraph" per labeled row (title + abstract + CPC + owner)
# and, in the same order, the list of manual labels used as training targets.
labels = []
paragraphs = []
for _, row in dataframe1.iterrows():

	# Drop the spaces inside the CPC cell, then turn its line breaks into spaces.
	# Presumably the cell holds one classification code per line -- TODO confirm.
	cpc = row['CPC'].replace(" ", "").replace("\n", " ")

	# Append "*" to signify title words.
	# NOTE(review): the CountVectorizer created later in this script uses the
	# default token pattern, which appears to discard punctuation such as "*";
	# confirm whether title words really end up as distinct tokens.
	title = " ".join(word + "*" for word in row['Title'].split())

	# Collapse the owner name into a single whitespace-free string.
	owner = str(row['Ultimate Owner']).replace(" ", "").replace("\n", "")

	labels.append(row['Manual Label'])
	paragraphs.append(title + " " + str(row['Abstract']) + " " + cpc + " " + owner)

# Turn every paragraph into a row of token counts. The fitted vectorizer is
# reused further down to encode the unlabeled data with the same vocabulary.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(paragraphs)

# Hold out half of the labeled sample for evaluation. The split is seeded
# (same seed as SMOTE below) so the reported metrics are reproducible from run
# to run; previously only the oversampling step was deterministic.
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.5, random_state=42)

# Oversample the training half only, so the held-out half is left untouched.
sm = SMOTE(random_state=42, k_neighbors=5)
X_res, labels_res = sm.fit_resample(X_train, y_train)

# Fit the Naive Bayes classifier on the resampled training data.
model = MultinomialNB()
model.fit(X_res, labels_res)

# Score the classifier on the held-out half of the labeled sample.
y_pred = model.predict(X_test)


print("Classification Report:")
# Unpacking four values assumes exactly two label classes; for a binary
# problem the flattened confusion matrix is ordered tn, fp, fn, tp.
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
print(classification_report(y_test, y_pred))
print(f"True Positives (TP): {tp}")
print(f"False Positives (FP): {fp}")
# The abbreviation for this count used to read "FN", duplicating the line below.
print(f"True Negatives (TN): {tn}")
print(f"False Negatives (FN): {fn}")


##################
# Label All Data #
##################

# Load the full data sheet and drop the columns whose header starts with "Unnamed".
dataframe2 = pd.read_excel('data.xlsx', 'Data')
dataframe2 = dataframe2.loc[:, ~dataframe2.columns.str.contains('^Unnamed')]

print(dataframe2.head())

# Build one paragraph per row exactly the way the training paragraphs were
# built (title words suffixed with "*", abstract, CPC, owner). Previously the
# title suffix here was "618" and the owner was left out, so the text handed
# to the fitted vectorizer did not match the text the classifier was trained
# and evaluated on.
paragraphs = []
bar = progressbar.ProgressBar(maxval=dataframe2.shape[0]).start()
for position, (_, row) in enumerate(dataframe2.iterrows()):
	# Drop the spaces inside the CPC cell, then turn its line breaks into spaces.
	cpc = row['CPC'].replace(" ", "").replace("\n", " ")
	# Same title marker as the training features.
	title = " ".join(word + "*" for word in row['Title'].split())
	# Collapse the owner name into a single whitespace-free string.
	owner = str(row['Ultimate Owner']).replace(" ", "").replace("\n", "")

	paragraphs.append(title + " " + str(row['Abstract']) + " " + cpc + " " + owner)

	# Report progress by row position rather than by index label, so the value
	# always stays within the bar's 0..maxval range.
	bar.update(position)
bar.finish()

print("Making predictions")
print()
# Encode the paragraphs with the vocabulary learned during training and
# classify every row.
X = vectorizer.transform(paragraphs)
predictions = model.predict(X)

# One output record per input row: predicted label followed by identifying columns.
output = []

bar2 = progressbar.ProgressBar(maxval=dataframe2.shape[0]).start()
for position, (_, row) in enumerate(dataframe2.iterrows()):
	# `predictions` is ordered by row position, so pair by position instead of
	# by the DataFrame index label (the two only coincide for a default index).
	output.append([
		predictions[position],
		row['Publication Number'],
		row['Application/Filing Date'],
		row['CPC'],
		row['Ultimate Owner'],
	])
	bar2.update(position)
bar2.finish()

# Columns are left unnamed, so the sheet keeps its positional 0..4 headers.
df = pd.DataFrame(output)
df.to_excel("Output.xlsx")

